#' ---
#' title: Reproduce Appendix Figure A2 and Table A4 (Sentence-Level Correlations for Manifesto Application)
#' date: 2024-10-04
#' version: 1.0
#' ---

library(tidyverse)
set.seed(42)

## 1. Load and Clean Sentence-Level GPT-3 Labels ---------------------

# Raw one-shot GPT-3 output, one row per sentence. The code below relies on
# both files having `sentenceid`, `token1`..`token5` and `prob1`..`prob5`.
# `ideology` is only loaded here; it is consumed when the ideology scores are
# built further down.
ideology <- read_csv('application3-one-shot-gpt-3-ideology.csv')

policy <- read_csv('application3-one-shot-gpt-3-policy.csv')

# Reshape to one row per (sentence, returned token). Pivoting tokens and
# probabilities together via `.value` keeps every token on the same row as its
# own probability, instead of pivoting them separately and re-pairing the two
# results by row position.
policy_labels <- policy |>
  select(sentenceid, token1:token5, prob1:prob5) |>
  pivot_longer(cols = -sentenceid,
               names_to = c('.value', 'rank'),
               names_pattern = '^(token|prob)(\\d+)$') |>
  select(sentenceid, policy = token, prob)

# Remove whitespace and capitalize. Replace blank with Neither.
# NOTE(review): a token that read_csv parsed as NA stays NA here, because
# `NA == ''` is NA -- confirm that is intended for the input files.
policy_labels <- policy_labels |>
  mutate(policy = str_to_title(str_trim(policy)),
         policy = if_else(policy == '', 'Neither', policy))

# Sum the probabilities per label and keep the single highest-scoring label
# for each sentenceid. `with_ties = FALSE` guarantees at most one row per
# sentence; with the default, a tie would return several rows and the join
# below would duplicate that sentence in every downstream figure and table.
policy_labels <- policy_labels |>
  group_by(sentenceid, policy) |>
  summarize(policy_confidence = sum(prob), .groups = 'drop') |>
  group_by(sentenceid) |>
  slice_max(policy_confidence, n = 1, with_ties = FALSE) |>
  ungroup()

# Sentence-level table: the policy file (minus the unnamed first column that
# read_csv calls `...1`) plus the winning policy label and its confidence.
gpt3_sentences <- policy |>
  select(-`...1`) |>
  left_join(policy_labels, by = 'sentenceid')


# Assign each sentence a Conservative/Liberal/Neither probability.

# One row per (sentence, returned token). Tokens and probabilities are pivoted
# in a single step via `.value`, so each token stays on the same row as its own
# probability rather than being re-paired by row position afterwards.
ideology_labels <- ideology |>
  select(sentenceid, token1:token5, prob1:prob5) |>
  pivot_longer(cols = -sentenceid,
               names_to = c('.value', 'rank'),
               names_pattern = '^(token|prob)(\\d+)$') |>
  select(sentenceid, ideology = token, prob)

# Remove whitespace and capitalize, then collapse synonyms:
# blank -> Neither, Right/Conservatives -> Conservative, Left/Labour -> Liberal.
# Any other token is kept as-is and filtered out below.
ideology_labels <- ideology_labels |>
  mutate(ideology = str_to_title(str_trim(ideology)),
         ideology = case_when(
           ideology == '' ~ 'Neither',
           ideology %in% c('Conservatives', 'Right') ~ 'Conservative',
           ideology %in% c('Labour', 'Left') ~ 'Liberal',
           TRUE ~ ideology
         ))

# Sum the probabilities per sentence into one column per label. Summing
# directly (rather than pivot_wider) means all three columns always exist,
# even if a label never occurs in the data; a label missing for a given
# sentence gets 0 because the sum of an empty vector is 0. Sentences with none
# of the three labels are dropped here and become NA after the join.
ideology_labels <- ideology_labels |>
  filter(ideology %in% c('Conservative', 'Neither', 'Liberal')) |>
  group_by(sentenceid) |>
  summarize(Conservative = sum(prob[ideology == 'Conservative']),
            Liberal = sum(prob[ideology == 'Liberal']),
            Neither = sum(prob[ideology == 'Neither']),
            .groups = 'drop')

# merge with dataset
gpt3_sentences <- gpt3_sentences |>
  left_join(ideology_labels, by = 'sentenceid')

# Rescale the three label probabilities so they add up to 1 within each
# sentence, then score ideology as P(Conservative) - P(Liberal).
# `p_sum` is a scratch column and is dropped again at the end.
gpt3_sentences <- mutate(
  gpt3_sentences,
  p_sum = Conservative + Liberal + Neither,
  Conservative = Conservative / p_sum,
  Liberal = Liberal / p_sum,
  Neither = Neither / p_sum,
  gpt3_ideology = Conservative - Liberal
)
gpt3_sentences <- select(gpt3_sentences, -p_sum)

## 2. Compare sentence level estimates with crowd-coders from Benoit et al. (2016) ------------------

# Crowd codes: read the sentence-level file, keep non-missing codes from the
# 'Crowd' source for sentences that also appear in the GPT-3 data, and average
# the codes within each manifesto / sentence / scale. `num_coders` is the
# number of codes that went into each average.
crowd_codes <- read_csv('application3-benoit-sentence-estimates.csv') |>
  filter(!is.na(code) &
           source == 'Crowd' &
           sentenceid %in% gpt3_sentences$sentenceid) |>
  group_by(manifestoid, sentenceid, scale) |>
  summarize(crowd_ideology = mean(code),
            num_coders = n()) |>
  # for each sentence keep the one scale that received the most codes;
  # with_ties = FALSE returns exactly one row per sentence even on a tie
  group_by(manifestoid, sentenceid) |>
  slice_max(num_coders, n = 1, with_ties = FALSE)

# No `by` is given, so the join uses every column name the two tables share
# (`sentenceid` at least) and dplyr reports the chosen keys in a message.
gpt3_sentences <- left_join(gpt3_sentences, crowd_codes)

## 3. Construct Figure A2 ------------

# Facet labels, in left-to-right panel order. The two panels are disjoint
# subsets split at 25 crowd-coders, so the first label names what the panel
# really contains: the original 'All Sentences' label was attached to the
# sentences with fewer than 25 coders only. Plotted data are unchanged.
coder_levels <- c('Fewer Than 25 Crowd-Coders', 'At Least 25 Crowd-Coders')

# Scatter of GPT-3 vs. crowd-coded positions with a linear fit, one row of
# panels per policy area and one column per coder-count subset. Only sentences
# labelled Social or Economic that have crowd codes are shown.
p <- gpt3_sentences |>
  filter(policy %in% c('Social', 'Economic')) |>
  filter(!is.na(num_coders)) |>
  mutate(quantity_of_coders = factor(if_else(num_coders >= 25,
                                             coder_levels[2],
                                             coder_levels[1]),
                                     # explicit levels pin the panel order
                                     levels = coder_levels)) |>
  ggplot(mapping = aes(x = crowd_ideology,
                       y = gpt3_ideology)) +
  geom_point(alpha = 0.5) +
  # formula spelled out (it is the default) to silence the geom_smooth message
  geom_smooth(method = 'lm', formula = y ~ x) +
  facet_grid(policy ~ quantity_of_coders) +
  theme_bw() +
  labs(x = 'Crowd-Coded Position', y = 'GPT-3 Coded Position')

ggsave(filename = 'figure-a2.png',
       plot = p,
       height = 5, width = 8)

## 4. Construct Table A4 --------------

# Correlation between crowd-coded and GPT-3 positions, by policy area and by
# number of crowd-coders, written to 'table-a4.txt'.
# The rows are disjoint subsets split at 25 coders. The labels now say so:
# the original 'N > 25' was applied to `num_coders >= 25`, and 'N > 1' to the
# remaining sentences (fewer than 25 coders). The numbers are unchanged.
gpt3_sentences |>
  filter(policy %in% c('Social', 'Economic')) |>
  filter(!is.na(num_coders)) |>
  mutate(quantity_of_coders = factor(if_else(num_coders >= 25,
                                             'N >= 25',
                                             'N < 25'),
                                     # explicit levels keep the smaller-N rows
                                     # first regardless of collation locale
                                     levels = c('N < 25', 'N >= 25'))) |>
  group_by(Policy = policy, `Number of Crowd-Coders` = quantity_of_coders) |>
  # .groups = 'drop' hands an ungrouped table to arrange() and tinytable
  summarize(Correlation = round(cor(crowd_ideology, gpt3_ideology), 2),
            `Number of Sentences` = n(),
            .groups = 'drop') |>
  arrange(`Number of Crowd-Coders`, Policy) |>
  tinytable::tt() |>
  tinytable::save_tt('table-a4.txt')



